0.1 Variable descriptions

0.2 Read in datasets

Language codes (from WALS) – WALS, ISO, ascii-name. These are used to merge across datasets.

codes = read.csv("../data/language_codes.csv") %>%
  select(language, WALS, ISO) %>%
  mutate(ISO = unlist(lapply(strsplit(as.character(ISO),
                                      ","),function(x) x[1])))
# in cases where there are two ISO codes, takes first

Lupyan & Dale (2010): Demographic variables and syntactic compelxity

wichmann_ISOS = read.csv("../data/ISO_mappings_from_wichmann.csv", 
                         stringsAsFactors=F)[-1] %>%
  unique() 

ld = read.table("../data/lupyan_2010.txt", fill = T, 
                header = T, sep = "\t", na.strings = "*") %>%
  left_join(codes, c("walsCode" = "WALS")) %>%
  mutate(lang = tolower(lang)) %>%
  left_join(wichmann_ISOS, by = "lang") %>% # use wichmann ISOS for some missing
  mutate(ISO = as.factor(ifelse(is.na(ISO.x), ISO.y, ISO.x))) %>%
  select(-ISO.y, -ISO.x) %>%
  filter(!is.na(ISO)) %>%
  filter(ISO != "")

# rename columns
ld = rename(ld, pop_log = logpop2, 
         mean.temp = aveTemp,
         n.neighbors = numNeighbors,
         sum.precip = sumPrecip,
         sd.temp = sdTemp,
         sd.precip = sdPrecip,
         lang.family = langFamily,
         lang.genus = langGenus,
         native.country = nativeCountry,
         native.country.area = nativeCountryArea,
         growing.season = growingSeason) 

# get means across language
ld.demo = ld %>%
  group_by(ISO) %>%
  summarise_each(funs(mean(., na.rm = TRUE)), 
                 c(8:9, 16:121))  %>%
  select(1:3, 93:95, 100:101, 103:105, 108)

# add in data family
fams = ld %>%
  group_by(ISO) %>%
  filter(row_number() == 1) %>%
  select(ISO, lang.family, lang.genus, native.country,
         native.country.area, language)  

ld.demo = left_join(fams, ld.demo, by = "ISO") %>% ungroup()

WALS features of complexity

# get variables to include 
qual = ld %>%
  select(18:103, 124) %>%
  mutate_each(funs(as.factor)) %>%
  group_by(ISO) %>%
  summarise_each(funs(most.frequent.level)) %>%
  mutate_each(funs(as.factor)) 

ld_feature_names = read.csv("../data/lupyan_2010_all_feature_mappings.csv") 
qualVarNames = intersect(names(qual), ld_feature_names$WALS.feature.name)
qual = select(qual,which(is.element(names(qual), c("ISO", qualVarNames))))

# remap factor levels to complexity values (0,1)
# note two variables that are reported in the paper (NICPOC[59] and CYSIND[39] are missing in the data)

for (i in 1:length(qualVarNames)){
  thisVarLabs = ld_feature_names[ld_feature_names$WALS.feature.name == qualVarNames[i],]
  old = thisVarLabs$ld.level.label
  if (is.na(old)) {print("NA!!!")}
  new = thisVarLabs$ld.complexity
  col_i = grep(qualVarNames[i], colnames(qual))
  qual[,col_i] = mapvalues(as.matrix(qual[,col_i]), 
                           from = as.character(old),
                           to = as.character(new), warn_missing = TRUE)
}

# [1716]
ld.complexity = qual %>%
  mutate_each(funs(as.numeric), -ISO) %>%
  gather(variable, complexity.level, 2:28) %>%
  group_by(ISO) %>%
  summarise(morphological.complexity = sum(complexity.level))

ld.demo.qual = left_join(ld.demo, ld.complexity)

Bentz, et al. (2015): L2 learners and lexical diversity.Note here that we are only looking at languages from the UDHR corpus. The other corpora have smaller languages only.

bentz = read.csv("../data/bentz_2015.csv") %>%
  gather(temp, LDT, starts_with("LDT")) %>% 
  unite(temp1, measure, temp, sep = ".") %>% 
  spread(temp1, LDT) %>%
  filter(text == "UDHR") %>%
  select(iso_639_3, RatioL2, a.LDTscaled, 
         H.LDTscaled, TTR.LDTscaled, Stock,
         Region, L1_speakers, L2_speakers) %>%
  rename(ISO = iso_639_3,
         n.L1.speakers = L1_speakers,
         n.L2.speakers = L2_speakers,
         ratio.L2.L1 = RatioL2,
         stock = Stock,
         region = Region,
         scaled.LDT.TTR = TTR.LDTscaled,
         scaled.LDT.ZM = a.LDTscaled,
         scaled.LDT.H = H.LDTscaled) 

# n_speakers taken directly from ethnologue website by ML for missing languages that were high-frequency across the datasets
ethno_sup = read.csv("../data/supplementary_enthnologue.csv")
bentz = full_join(bentz, ethno_sup) %>%
        mutate(ISO = as.factor(ISO))

Atkinson (2011)

atkinson = read.csv("../data/atkinson_2011.csv") %>%
  select(normalized.vowel.diversity,
         normalized.consonant.diversity,    
         normalized.tone.diversity,
         normalized.phoneme.diversity, ISO, 
         distance.from.origin) %>%
  mutate(ISO = as.factor(unlist(lapply(strsplit(as.character(ISO),
                                      " "),function(x) x[1])))) %>%
  distinct(ISO) 

Lewis & Frank (under review): Complexity Bias

# complexity bias
cb = read.csv("../data/lewis_2015.csv") %>%
  rename(complexity.bias = corr,
         p.complexity.bias = p.corr,
         mono.complexity.bias = mono.cor,
         open.complexity.bias = open.cor) %>%
  left_join(codes, by="language") %>%
  select(-X.1, -X, -lower.ci, -upper.ci, -checked,
         -mean.length) %>%
  distinct(ISO) %>%
  filter(language != "english") %>% # english is an outlier in this dataset because norms were colelcted in english
  select(-language, -WALS)

Futrell, Mahowald, & Gibson (2015): Dependency length

dl = read.csv("../data/futrell_2015.csv") %>%
  left_join(codes, by="language") %>%
  select(-language, -WALS, -fixed.random.baseline.slope, 
         -observed.slope, -m_rand) %>%
  rename(mean.dependency.length = m_obs) %>%
  filter(!is.na(ISO))

Pellegrino, Coupe, & Marsico (2015): Information density

uid = read.csv("../data/pellegrino_2015.csv")  %>%
  left_join(codes, by="language") %>%
  select(-language, -WALS)

Moran, McCloy, & Wright (2012): Phonemic inventory

phoneme = read.csv("../data/moran_2012.csv") %>%
  select(ISO, pho, con, vow, son, 
         obs, mon, qua, ton) %>%
  rename(n.phonemes = pho,
         n.consonants = con,
         n.vowels = vow,
         n.sonorants = son,
         n.obstruents = obs,
         n.monophthongs = mon,
         n.qual.monophthongs = qua,
         n.tones = ton)

Wichmann, et al. (2013): Mean word length

# note is this IPA? - fix this!
# [308 low frequency language have no ISO, leaving 4421 (TOTAL:4729)]
ml.raw = read.csv("../data/wichmann_2013.csv") %>% 
  select(1,1:109) %>%
  gather(word,translation,I:name) %>%
  mutate(nchar = unlist(
    lapply(
      lapply(
        strsplit(
          gsub("[[:space:]]", "", translation) ,
          ","), 
        nchar), mean))) %>%
  filter(translation != "")

# subset to only those words in the swadesh list (n = 40)
swadesh.words  = ml[ml$ISO == "gwj", "word"] 
ml.raw = ml.raw %>%
  filter(is.element(word, swadesh.words))

ml = ml.raw %>%
  group_by(ISO) %>%
  summarize(mean.length = mean(nchar, na.rm = T)) %>%
  filter(ISO != "")

#more_ISOs = ml.raw %>%
#  select(ISO, names) %>%
#  mutate(names = tolower(names))

Luniewska, et al. (2015): Age of acquistion (Aoa)

aoa.raw = read.csv("../data/luniewska_2015.csv", 
                   header = T, fileEncoding = "latin1")

aoa = aoa.raw %>%
  gather(language_aoa, aoa, grep("aoa", 
                                 names(aoa.raw))) %>%
  mutate(language_aoa = tolower(unlist(lapply(strsplit(
    as.character(language_aoa),"_"), function(x) x[1])))) %>%
  select(-3:-27) %>%
  rename(language = language_aoa) %>%
  left_join(codes, by="language") %>%
  mutate(language = as.factor(language)) %>%
  group_by(ISO) %>%
  summarize(mean.aoa = mean(aoa, na.rm = T)) %>%
  filter(!is.na(ISO))

Merge data frames together by ISO

d = full_join(ld.demo.qual, cb, by = "ISO") %>%
  full_join(bentz, by = "ISO") %>%
  full_join(phoneme, by = "ISO") %>%
  full_join(ml, by = "ISO") %>%
  full_join(dl, by = "ISO") %>%
  full_join(uid, by = "ISO") %>%
  full_join(aoa, by = "ISO") %>%
  full_join(atkinson, by = "ISO") %>%
  mutate(ISO = as.factor(ISO)) %>%
  filter(!is.na(native.country)) # this is important to have for the mixed effects models

load data

# write.csv(d, "../data/langLearnVar_data.csv")
d = read.csv("../data/langLearnVar_data.csv")[,-1]

0.3 Data cleaning

Remove outliers and check for normality

d.clean = d %>%
  mutate_each(funs(removeOutlier(.,N_EXCLUDE_SDS)), c(-ISO, -lang.family, 
              -native.country, -native.country.area, 
         -lang.genus, -language, -stock, -region))

d.clean %>%
  select(-ISO, -lang.family, -native.country, -native.country.area, 
         -lang.genus, -language, -stock, -region) %>%
  gather(var, value) %>%
  ggplot(aes(sample = value)) + 
  stat_qq(na.rm = T, size = .2) + 
  facet_wrap( ~ var, scales = "free") +
  theme_bw()

The following variables look right-skewed.

NN_vars = c("area", "perimeter", "n.neighbors", "sd.precip",
           "ratio.L2.L1", "n.L1.speakers", "n.L2.speakers",
           "n.phonemes", "n.consonants","n.vowels", "n.sonorants",
           "n.obstruents", "n.monophthongs", "n.qual.monophthongs",
           "n.tones")

Take the log of these variables, and check for normality again.

d.clean = d.clean %>%
  mutate_each(funs(log = log(.), dummy =  sum(.)),
              one_of(NN_vars)) %>%
  select(-contains("dummy")) %>%
  select(-one_of(NN_vars))

d.clean %>%
  select(-ISO, -lang.family, -native.country, -native.country.area, 
         -lang.genus, -language, -stock, -region) %>%
  gather(var, value) %>%
  ggplot(aes(sample = value)) + 
  stat_qq(na.rm = T, size = .2) + 
  facet_wrap( ~ var, scales = "free") +
  theme_bw()

Check for normality of variables using Shapiro-Wilk Normality Test

is.na(d.clean) <- sapply(d.clean, is.infinite)

normality.test = d.clean %>%
  summarise_each(funs(unlist(tidy(shapiro.test(.))[2])),
                 c(-ISO, -lang.family, -native.country,
                   -native.country.area,
                     -lang.genus, -language, -stock, -region)) %>%
  gather(variable, shapiro.p.value) %>%
  mutate(normal = ifelse(shapiro.p.value > .05, TRUE, FALSE)) %>%
  summary()

By this test, only 11 variables normal. But, I think this is the best we can do (?).

Look at histograms of all variables.

demo_vars = c("lat", "lon", "perimeter_log", "n.neighbors_log",
              "mean.temp", "sum.precip",
              "sd.temp", "growing.season", "pop_log", "area_log",
              "n.neighbors_log", "sd.precip_log", "ratio.L2.L1_log",
              "n.L1.speakers_log",
              "n.L2.speakers_log", "distance.from.origin")

lang_vars = c(setdiff(names(d.clean), demo_vars))[-c(1:6, 22,23)]

d.clean %>%
  gather(variable, num, lat:length(d.clean))  %>%
  filter(variable != "stock", variable != "region") %>%
  mutate(var_type = ifelse(is.element(variable, demo_vars),
                           "demo", "lang"),
         num = as.numeric(as.character(num))) %>%
  arrange(var_type) %>% 
  mutate(variable = factor(variable, variable)) %>%
  ggplot(aes(x = num, fill = var_type)) + 
    geom_histogram(position = "identity") +
    facet_wrap( ~ variable, scales = "free") + 
    theme_bw()

Number of languages we have data, for each variable

d.clean %>%
  gather(variable, num, lat:length(d.clean))  %>%
  filter(variable != "stock", variable != "region") %>%
  group_by(variable) %>%
  summarize(counts = length(which(is.na(num) == F)))  %>%
  mutate(var_type = ifelse(is.element(variable, demo_vars),
                           "demo", "lang")) %>%
  mutate(counts_trunc = ifelse(counts > 150, 150, counts))  %>%
  arrange(var_type) %>% 
  mutate(variable = factor(variable, variable)) %>%
  ggplot(aes(y=counts_trunc, x = variable, fill = var_type)) + 
    geom_bar(stat = "identity") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
          legend.position = "none") +
    ylab("Number of languages (truncated at 150)") +
    ggtitle("Number languages per measure")

Look at number of languages intersecting each variable

arealControl_vars = c("stock", "region", "lang.family", "lang.genus",
                      "native.country", "native.country.area")

d.clean %>%
  gather(lang_measure, lang_value, 
         which(is.element(names(.), c(lang_vars, arealControl_vars)))) %>%
  group_by(lang_measure) %>%
  gather(pop_measure, pop_value,
         which(is.element(names(.), demo_vars))) %>%
  group_by(lang_measure, pop_measure) %>%
  mutate(both_not_na = !is.na(lang_value) & !is.na(pop_value)) %>%
  summarise(n_languages = length(which(both_not_na == TRUE))) %>%
  ggplot(aes(pop_measure, lang_measure, group = lang_measure)) +
      geom_tile(aes(fill = n_languages)) + 
      geom_text(aes(fill = n_languages, label = n_languages)) +
      scale_fill_gradient(low = "white", high = "red")  +
      xlab("population variables") + 
      ylab("language variables") + 
      ggtitle("Number languages between variables") +
      theme(axis.text.x = element_text(angle = 90, hjust = 1))

0.4 Relationship between language and demographic variables

Fits controling for family in a mixed effect model

# write.csv(d.clean,"../data/langLearnVar_data_clean.csv")
# d.clean = read.csv("../data/langLearnVar_data_clean.csv")[-1]

# these are the variables we care about for the paper
demo_vars_crit = c("n.neighbors_log",
              "mean.temp", "sum.precip", "sd.temp",
              "pop_log", "area_log", "n.neighbors_log",
              "sd.precip_log", "ratio.L2.L1_log")

lang_vars_crit = c("p.complexity.bias","scaled.LDT.TTR",
                   "mean.dependency.length", "mean.length",
                   "mean.aoa", "n.consonants_log", "n.vowels_log",
                   "morphological.complexity")

d.scatter.fam = d.clean %>% 
  select(ratio.L2.L1_log, pop_log,
         n.neighbors_log, area_log, mean.temp, sd.temp, sum.precip,
         sd.precip_log, n.consonants_log, n.vowels_log, mean.length,
         p.complexity.bias, scaled.LDT.TTR, morphological.complexity, 
         mean.aoa, lang.family, native.country) %>%
  gather(lang_measure, lang_value, 
         which(is.element(names(.), lang_vars_crit))) %>%
  group_by(lang_measure) %>%
  gather(pop_measure, pop_value,
         which(is.element(names(.), demo_vars_crit)))

is.na(d.scatter.fam) <- sapply(d.scatter.fam, is.infinite) 
d.scatter.fam <- filter(d.scatter.fam, !is.na(pop_value),
                        !is.na(lang_value))

# get model fits
d.model.fits = d.scatter.fam %>%
  group_by(lang_measure, pop_measure) %>%
  do(tidy(lmer(lang_value ~ pop_value + (pop_value|lang.family) +
                 (1|native.country), data=.))) 

line.fits = d.model.fits %>%
          filter(term == "pop_value" | term == "(Intercept)") %>%
  select(-std.error, -statistic, -group) %>%
  spread(term, estimate) %>%
  rename(s = pop_value)
names(line.fits)[3] = "i"

sigs = d.model.fits %>%
  filter(term == "pop_value") %>%
  mutate(sig.col = ifelse(statistic > 1.96, "pos",
                          ifelse(statistic < -1.96, "neg",
                                 "none"))) %>%
  mutate(pop_value = .1, lang_value = .1) %>% # this is a hack
  ungroup

# plot
ggplot(d.scatter.fam, aes(x = pop_value, y = lang_value)) +
  geom_rect(data = sigs, aes(fill = sig.col), 
          xmin = -Inf, xmax = Inf,
          ymin = -Inf, ymax = Inf, alpha = 0.2) +
  geom_point(size = .3) +
  geom_abline(line.fits, mapping = aes(slope = s, intercept = i), size = 1, color = "green") +
  facet_grid(lang_measure ~ pop_measure, scales = "free") +
  scale_fill_manual(values = c( "mediumblue", "grey99", "red1")) +
  theme_bw() +
  xlab("Demographic variables") +
  ylab("Language variables") +
  theme(legend.position = "none")

0.5 Relationship between language and demographic variables, with PCA

Look at correlations between PCA variables

# get demographic variables only (excluding L2 because intersections contains only 40 languages)
d.demo =  d.clean %>%
  select(lat, mean.temp, sd.temp, sum.precip, sd.precip_log, 
         area_log, pop_log, n.neighbors_log) %>% 
  mutate(lat = abs(lat))

is.na(d.demo) <- sapply(d.demo, is.infinite) 

demo.corr = cor(d.demo, use = "pairwise.complete")
abs(demo.corr) > CORR_THRESHOLD
##                   lat mean.temp sd.temp sum.precip sd.precip_log area_log
## lat              TRUE      TRUE   FALSE      FALSE          TRUE    FALSE
## mean.temp        TRUE      TRUE   FALSE      FALSE         FALSE    FALSE
## sd.temp         FALSE     FALSE    TRUE      FALSE         FALSE    FALSE
## sum.precip      FALSE     FALSE   FALSE       TRUE         FALSE    FALSE
## sd.precip_log    TRUE     FALSE   FALSE      FALSE          TRUE    FALSE
## area_log        FALSE     FALSE   FALSE      FALSE         FALSE     TRUE
## pop_log         FALSE     FALSE   FALSE      FALSE         FALSE    FALSE
## n.neighbors_log FALSE     FALSE   FALSE      FALSE         FALSE    FALSE
##                 pop_log n.neighbors_log
## lat               FALSE           FALSE
## mean.temp         FALSE           FALSE
## sd.temp           FALSE           FALSE
## sum.precip        FALSE           FALSE
## sd.precip_log     FALSE           FALSE
## area_log          FALSE           FALSE
## pop_log            TRUE           FALSE
## n.neighbors_log   FALSE            TRUE
# exclude lat (correlated with mean.temp, sd.precip_log) [threshold: > .8]
d.demo =  select(d.demo, -lat)

# do pca
pca.demo = prcomp(na.omit(d.demo), scale = TRUE)
barplot(pca.demo$sdev/pca.demo$sdev[1])

pca.demo = prcomp(na.omit(d.demo), scale = TRUE, tol = .6)

# look at variance explained
fviz_eig(pca.demo, addlabels = TRUE, 
               linecolor ="red", geom = "line") +
  theme_bw()

# plot loadings
pcs.demo <- cbind(names(d.demo),
                    gather(as.data.frame(pca.demo$rotation), 
                           pc, value))
names(pcs.demo)[1] = "variable"
 
pcs.demo$variable = factor(pcs.demo$variable, levels = pcs.demo$variable[1:8]) 

# order variables
pcs.demo$magnitude = ifelse(abs(pcs.demo$value)>.2, TRUE, FALSE)
ggplot(pcs.demo) +
  geom_bar(aes(x = variable, y = value,
               fill = magnitude), stat="identity") +
  facet_wrap(~pc) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))  + 
       scale_fill_manual(name = "Magnitude", 
                       values = c("grey","red1")) +
  ggtitle("PCA loadings")

# merge PCAs with full dataset for model fits
d.demo[!is.na(d.demo)] = 0
d.demo[is.na(d.demo)] = 1
good_is = which(rowSums(d.demo) == 0)
d.pca.demo = d.clean[good_is,] %>% cbind(pca.demo$x)

# flip size PC so can be interpretted as "bigness"
d.pca.demo$PC2 = -d.pca.demo$PC2  

# rename PCs
names(d.pca.demo)[names(d.pca.demo) == "PC1"] = "hot"
names(d.pca.demo)[names(d.pca.demo) == "PC2"] = "big"

Principle components by language

ggplot(d.pca.demo) +   
  borders("world", colour="gray50", fill="gray50") +
  geom_point(aes(x = lon, y=lat, 
               color = hot), size=3) +
  scale_colour_gradient(high="green", low = "white") +
  ggtitle("Hot (PC1)") + 
  mapTheme

ggplot(d.pca.demo) +   
  borders("world", colour="gray50", fill="gray50") +
  geom_point(aes(x = lon, y=lat, 
               color = big), size=3) +
  scale_colour_gradient(high="blue", low = "white") +
  ggtitle("Big (PC2)") + 
  mapTheme

Model fits using PCA

demo_vars_crit2 = c("hot", "big")

lang_vars_crit2 = c("morphological.complexity",
                    "p.complexity.bias","scaled.LDT.TTR",
                    "mean.length", "mean.aoa",
                    "n.consonants_log", "n.vowels_log")

d.scatter.fam = d.pca.demo %>% 
  select(n.consonants_log, n.vowels_log, 
         mean.length, p.complexity.bias, scaled.LDT.TTR,
         morphological.complexity,
         lang.family, native.country, big, hot) %>%
  gather(lang_measure, lang_value, 
         which(is.element(names(.), lang_vars_crit2))) %>%
  group_by(lang_measure) %>%
  gather(pop_measure, pop_value,
         which(is.element(names(.), demo_vars_crit2)))

is.na(d.scatter.fam) <- sapply(d.scatter.fam, is.infinite) 
d.scatter.fam <- filter(d.scatter.fam, !is.na(pop_value),
                        !is.na(lang_value))

# get model fits
d.model.fits = d.scatter.fam %>%
  group_by(lang_measure, pop_measure) %>%
  do(tidy(lmer(lang_value ~ pop_value + (pop_value|lang.family) + (1|native.country),
               data=.)))

line.fits = d.model.fits %>%
          filter(term == "pop_value" | term == "(Intercept)") %>%
  select(-std.error, -statistic, -group) %>%
  spread(term, estimate) %>%
  rename(s = pop_value)
names(line.fits)[3] = "i"

sigs = d.model.fits %>%
  filter(term == "pop_value") %>%
  mutate(sig.col = ifelse(statistic > 1.96, "pos",
                          ifelse(statistic < -1.96, "neg",
                                 "none"))) %>%
  mutate(pop_value = .1, lang_value = .1) %>% # this is a hack
  ungroup

ggplot(d.scatter.fam, aes(x = pop_value, y = lang_value)) +
  geom_rect(data = sigs, aes(fill = sig.col), 
        xmin = -Inf, xmax = Inf,
        ymin = -Inf, ymax = Inf, alpha = 0.2) +
  geom_point(size = .3) +
  geom_abline(line.fits, mapping = aes(slope = s, intercept = i), size = 1, color = "green") +
  facet_grid(lang_measure~pop_measure, scales = "free") +
  scale_fill_manual(values = c( "mediumblue", "grey99","red1")) +
  theme_bw() +
  xlab("Demographic variables") +
  ylab("Language variables") +
  theme(legend.position = "none")

0.6 Relationship between PC on language variables

Look at correlations between PCA variables

d.lang =  d.clean %>%
  select(morphological.complexity, scaled.LDT.H, 
         mean.length, n.consonants_log, n.vowels_log) 
is.na(d.lang) <- sapply(d.lang, is.infinite) 

lang.corr = cor(d.lang, use = "pairwise.complete")
abs(lang.corr) > CORR_THRESHOLD
##                          morphological.complexity scaled.LDT.H mean.length
## morphological.complexity                     TRUE        FALSE       FALSE
## scaled.LDT.H                                FALSE         TRUE       FALSE
## mean.length                                 FALSE        FALSE        TRUE
## n.consonants_log                            FALSE        FALSE       FALSE
## n.vowels_log                                FALSE        FALSE       FALSE
##                          n.consonants_log n.vowels_log
## morphological.complexity            FALSE        FALSE
## scaled.LDT.H                        FALSE        FALSE
## mean.length                         FALSE        FALSE
## n.consonants_log                     TRUE        FALSE
## n.vowels_log                        FALSE         TRUE
# include all language variables
# do pca
pca.lang = prcomp(na.omit(d.lang), scale = TRUE)
barplot(pca.lang$sdev/pca.lang$sdev[1])

# look at variance explained
fviz_eig(pca.lang, addlabels = TRUE, 
               linecolor ="red", geom = "line") +
  theme_bw()

pca.lang = prcomp(na.omit(d.lang), scale = TRUE, tol = .7)

# plot loadings
pcs.lang <- cbind(names(d.lang),
                    gather(as.data.frame(pca.lang$rotation), 
                           pc, value))
names(pcs.lang)[1] = "variable"
 
pcs.lang$variable =  factor(pcs.lang$variable , levels = pcs.lang$variable[1:8]) # order variables
pcs.lang$magnitude = ifelse(abs(pcs.lang$value)>.2, TRUE, FALSE)
ggplot(pcs.lang) +
  geom_bar(aes(x = variable, y = value,
               fill = magnitude), stat = "identity") +
  facet_wrap(~pc) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))  + 
       scale_fill_manual(name = "Magnitude", 
                       values = c("grey","red1")) +
  ggtitle("PCA loadings")

# merge PCAs with full dataset for model fits
d.lang[!is.na(d.lang)] = 0
d.lang[is.na(d.lang)] = 1
good_is = which(rowSums(d.lang) == 0)
d.pca.lang = d.clean[good_is,] %>% cbind(pca.lang$x)

names(d.pca.lang)[names(d.pca.lang) == "PC1"] = "PC1_L"
names(d.pca.lang)[names(d.pca.lang) == "PC2"] = "PC2_L"

Model fits using PCA - all demographic to predict lang principle components

lang_vars_crit2 = c("PC1_L", "PC2_L")

demo_vars_crit2 = c("n.neighbors_log",
              "mean.temp", "sum.precip", "sd.temp",
              "pop_log", "area_log", "n.neighbors_log",
              "sd.precip_log", "ratio.L2.L1_log")

d.scatter.fam = d.pca.lang %>% 
  select(ratio.L2.L1_log, pop_log,
         n.neighbors_log, area_log, mean.temp, sd.temp, sum.precip, 
         PC1_L, PC2_L, lang.family, native.country) %>%
  gather(lang_measure, lang_value, 
         which(is.element(names(.), lang_vars_crit2))) %>%
  group_by(lang_measure) %>%
  gather(pop_measure, pop_value,
         which(is.element(names(.), demo_vars_crit2)))

is.na(d.scatter.fam) <- sapply(d.scatter.fam, is.infinite) 
d.scatter.fam <- filter(d.scatter.fam, !is.na(pop_value),
                        !is.na(lang_value))

# get model fits
d.model.fits = d.scatter.fam %>%
  group_by(lang_measure, pop_measure) %>%
  do(tidy(lmer(lang_value ~ pop_value +
                 (1|native.country), data=.))) 

line.fits = d.model.fits %>%
          filter(term == "pop_value" | term == "(Intercept)") %>%
  select(-std.error, -statistic, -group) %>%
  spread(term, estimate) %>%
  rename(s = pop_value)
names(line.fits)[3] = "i"

sigs = d.model.fits %>%
  filter(term == "pop_value") %>%
  mutate(sig.col = ifelse(statistic > 1.96, "pos",
                          ifelse(statistic < -1.96, "neg",
                                 "none"))) %>%
  mutate(pop_value = .1, lang_value = .1) %>% # this is a hack
  ungroup


ggplot(d.scatter.fam, aes(x = pop_value, y = lang_value)) +
  geom_rect(data = sigs, aes(fill = sig.col), 
          xmin = -Inf, xmax = Inf,
          ymin = -Inf, ymax = Inf, alpha = 0.2) +
  geom_point(size = .3) +
  geom_abline(line.fits, mapping = aes(slope = s, intercept = i), size = 1, color = "green") +
  facet_grid(lang_measure~pop_measure, scales = "free") +
  scale_fill_manual(values = c( "mediumblue", "grey99","red1")) +
  theme_bw() +
  xlab("Demographic variables") +
  ylab("Language variables") +
  theme(legend.position = "none")

All PCAs

all.pca = left_join(d.pca.demo, d.pca.lang[,c("PC1_L", "PC2_L", "ISO")],
                    by = "ISO")

demo_vars_crit2 = c("hot", "big")
lang_vars_crit2 = c("PC1_L", "PC2_L")

d.scatter.fam = all.pca %>% 
  select(PC1_L, PC2_L, hot, big, 
         lang.family, native.country, ISO) %>%
  gather(lang_measure, lang_value, 
         which(is.element(names(.), lang_vars_crit2))) %>%
  group_by(lang_measure) %>%
  gather(pop_measure, pop_value,
         which(is.element(names(.), demo_vars_crit2)))

is.na(d.scatter.fam) <- sapply(d.scatter.fam, is.infinite) 
d.scatter.fam <- filter(d.scatter.fam, !is.na(pop_value),
                        !is.na(lang_value))

# get model fits
d.model.fits = d.scatter.fam %>%
  group_by(lang_measure, pop_measure) %>%
  do(tidy(lmer(lang_value ~ pop_value + (pop_value|lang.family) +
                 (1|native.country), data=.))) 

line.fits = d.model.fits %>%
          filter(term == "pop_value" | term == "(Intercept)") %>%
  select(-std.error, -statistic, -group) %>%
  spread(term, estimate) %>%
  rename(s = pop_value)
names(line.fits)[3] = "i"

sigs = d.model.fits %>%
  filter(term == "pop_value") %>%
  mutate(sig.col = ifelse(statistic > 1.96, "pos",
                          ifelse(statistic < -1.96, "neg",
                                 "none"))) %>%
  mutate(pop_value = .1, lang_value = .1) %>% # this is a hack
  ungroup

as.data.frame(d.model.fits)
##    lang_measure pop_measure                                  term
## 1         PC1_L         hot                           (Intercept)
## 2         PC1_L         hot                             pop_value
## 3         PC1_L         hot         sd_(Intercept).native.country
## 4         PC1_L         hot            sd_(Intercept).lang.family
## 5         PC1_L         hot              sd_pop_value.lang.family
## 6         PC1_L         hot cor_(Intercept).pop_value.lang.family
## 7         PC1_L         hot               sd_Observation.Residual
## 8         PC1_L         big                           (Intercept)
## 9         PC1_L         big                             pop_value
## 10        PC1_L         big         sd_(Intercept).native.country
## 11        PC1_L         big            sd_(Intercept).lang.family
## 12        PC1_L         big              sd_pop_value.lang.family
## 13        PC1_L         big cor_(Intercept).pop_value.lang.family
## 14        PC1_L         big               sd_Observation.Residual
## 15        PC2_L         hot                           (Intercept)
## 16        PC2_L         hot                             pop_value
## 17        PC2_L         hot         sd_(Intercept).native.country
## 18        PC2_L         hot            sd_(Intercept).lang.family
## 19        PC2_L         hot              sd_pop_value.lang.family
## 20        PC2_L         hot cor_(Intercept).pop_value.lang.family
## 21        PC2_L         hot               sd_Observation.Residual
## 22        PC2_L         big                           (Intercept)
## 23        PC2_L         big                             pop_value
## 24        PC2_L         big         sd_(Intercept).native.country
## 25        PC2_L         big            sd_(Intercept).lang.family
## 26        PC2_L         big              sd_pop_value.lang.family
## 27        PC2_L         big cor_(Intercept).pop_value.lang.family
## 28        PC2_L         big               sd_Observation.Residual
##       estimate std.error  statistic          group
## 1  -0.37000427 0.3299834 -1.1212816          fixed
## 2  -0.52775055 0.1644584 -3.2090223          fixed
## 3   1.08005388        NA         NA native.country
## 4   0.44197059        NA         NA    lang.family
## 5   0.13652068        NA         NA    lang.family
## 6  -1.00000000        NA         NA    lang.family
## 7   0.36586426        NA         NA       Residual
## 8   1.26222285 0.6152624  2.0515197          fixed
## 9  -0.47176860 0.2272726 -2.0757826          fixed
## 10  0.79365500        NA         NA native.country
## 11  1.38940145        NA         NA    lang.family
## 12  0.35905929        NA         NA    lang.family
## 13 -0.84421344        NA         NA    lang.family
## 14  0.59040582        NA         NA       Residual
## 15  0.65995752 0.4923687  1.3403727          fixed
## 16  0.11946456 0.2927700  0.4080492          fixed
## 17  0.13811957        NA         NA native.country
## 18  0.93961636        NA         NA    lang.family
## 19  0.63889384        NA         NA    lang.family
## 20  1.00000000        NA         NA    lang.family
## 21  1.00289185        NA         NA       Residual
## 22 -0.34963912 0.3650751 -0.9577183          fixed
## 23  0.23602945 0.1407070  1.6774534          fixed
## 24  0.00000000        NA         NA native.country
## 25  0.63987459        NA         NA    lang.family
## 26  0.07484889        NA         NA    lang.family
## 27  1.00000000        NA         NA    lang.family
## 28  1.05552191        NA         NA       Residual
ggplot(d.scatter.fam, aes(x = pop_value, y = lang_value)) +
  geom_rect(data = sigs, aes(fill = sig.col),
          xmin = -Inf, xmax = Inf,
          ymin = -Inf, ymax = Inf, alpha = 0.2) +
  geom_point(size = .3) +
  geom_abline(line.fits, mapping = aes(slope = s, intercept = i), size = 1, color = "green") +
  facet_grid(lang_measure~pop_measure, scales = "free") +
  scale_fill_manual(values = c( "mediumblue", "grey99","red1")) +
  theme_bw() +
  xlab("Demographic variables") +
  ylab("Language variables") +
  theme(legend.position = "none")

0.7 Replicate Lupyan and Dale (2010) complexity score plot (Fig. 3)

ld.demo.qual %>%
  mutate(morphological.complexity.factor =
           as.factor(morphological.complexity)) %>%
  group_by(morphological.complexity.factor) %>%
  multi_boot(column="pop_log",
              summary_groups = c("morphological.complexity.factor"),
              statistics_functions = c("mean", "ci_lower","ci_upper")) %>%
  mutate(morphological.complexity =  
           as.numeric(as.character(morphological.complexity.factor))) %>%
  ggplot() +
  geom_pointrange(aes(x= morphological.complexity.factor,
                      y = mean, ymin = ci_lower, ymax = ci_upper)) + 
  geom_smooth(method = "lm",aes(morphological.complexity-5, mean)) +
  # theres a weird offset because complexity is a factor?
  ylab("Log population") + 
  xlab("Morphological complexity") +
  ggtitle("Lupyan and Dale (2010) Fig. 3 reproduction") +
  theme_bw() 

0.8 Relationship between language variables for AOA

# predicted relationship with AOA  with TTR and number of consonants

# morphological complexity
tidy(lmer(mean.aoa ~ morphological.complexity + (morphological.complexity|lang.family) + (1|native.country), d.clean))[2, "statistic"]
cor.test(d.clean$mean.aoa, d.clean$morphological.complexity)

# complexity bias 
tidy(lmer(mean.aoa ~ p.complexity.bias + (p.complexity.bias|lang.family) + (1|native.country), d.clean))[2, "statistic"]
cor.test(d.clean$mean.aoa, d.clean$p.complexity.bias)

# TTR **
tidy(lmer(mean.aoa ~ scaled.LDT.TTR + (scaled.LDT.TTR|lang.family) + (1|native.country), d.clean))[2, "statistic"]
cor.test(d.clean$mean.aoa, d.clean$scaled.LDT.TTR)
cor.test(d.clean$mean.aoa, d.clean$scaled.LDT.H)
cor.test(d.clean$mean.aoa, d.clean$scaled.LDT.ZM)

# dependency length
tidy(lmer(mean.aoa ~ mean.dependency.length + (mean.dependency.length|lang.family) + (1|native.country), d.clean))[2, "statistic"]
cor.test(d.clean$mean.aoa, d.clean$mean.dependency.length)

# mean length
tidy(lmer(mean.aoa ~ mean.length + (mean.length|lang.family) + (1|native.country), d.clean)) [2, "statistic"]
cor.test(d.clean$mean.aoa, d.clean$mean.length)

# n_consonants **
tidy(lmer(mean.aoa ~ n.consonants_log + (n.consonants_log|lang.family) + (1|native.country) , d.clean))[2, "statistic"]
tidy(lmer(mean.aoa ~ normalized.consonant.diversity  + (1|native.country) , d.clean)) [2, "statistic"]

# n_phonemes
tidy(lmer(mean.aoa ~ n.phonemes_log + (n.phonemes_log|lang.family) + (1|native.country) , d.clean))[2, "statistic"]

#n_vowels
tidy(lmer(mean.aoa ~ n.vowels_log + (n.vowels_log|lang.family) + (1|native.country) , d.clean))[2, "statistic"]


ggplot(d.clean, aes(mean.length, mean.aoa, label = ISO)) +
  geom_point() +
  geom_label() +
  geom_smooth(method = "lm")

ggplot(d.clean, aes(scaled.LDT.TTR, mean.aoa, label = ISO)) +
  geom_point() +
  geom_label() +
  geom_smooth(method = "lm")

ggplot(d.clean, aes(n.consonants_log, mean.aoa, label = ISO)) +
  geom_point() +
  geom_label() +
  geom_smooth(method = "lm")

ggplot(d.clean, aes(n.phonemes_log, mean.aoa, label = ISO)) +
  geom_point() +
  geom_label() +
  geom_smooth(method = "lm")

ggplot(d.clean, aes(n.vowels_log, mean.aoa, label = ISO)) +
  geom_point() +
  geom_label() +
  geom_smooth(method = "lm")

ggplot(d.clean, aes(mean.dependency.length , mean.aoa, label = ISO)) +
  geom_point() +
  geom_label() +
  geom_smooth(method = "lm")